package com.tensquare.usercrawler.processor;

import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * Page processor that crawls CSDN blog article pages and extracts the
 * author's nickname and avatar URL from each page.
 *
 * <p>Pages on which either value cannot be found are marked as skipped so
 * that no partial result is emitted for them.
 */
@Component
public class UserProceesor implements PageProcessor{

    /**
     * Pattern of article links to follow.
     *
     * <p>The dots of the host name are escaped so that they only match a
     * literal '.', and the article id accepts eight <em>or more</em> digits:
     * with a fixed {8} quantifier a longer id would be matched only on its
     * first eight digits, yielding a truncated (wrong) URL.
     */
    private static final String ARTICLE_URL_REGEX =
            "https://blog\\.csdn\\.net/[a-zA-Z0-9_]+/article/details/[0-9]{8,}";

    /** XPath of the text of the element holding the user's nickname. */
    private static final String NICKNAME_XPATH = "//*[@id=\"uid\"]/text()";

    /** XPath of the anchor in the profile side panel that wraps the avatar image. */
    private static final String AVATAR_LINK_XPATH = "//*[@id=\"asideProfile\"]/div[1]/div[1]/a";

    /**
     * Queues every article link found on the page and publishes the user's
     * nickname and avatar URL as result fields.
     *
     * @param page the downloaded page to analyse
     */
    @Override
    public void process(Page page) {
        // 1. Discover new links: queue every article URL present in the HTML.
        page.addTargetRequests(page.getHtml().regex(ARTICLE_URL_REGEX).all());

        // 2. Extract the user's nickname and avatar.
        String nickname = page.getHtml().xpath(NICKNAME_XPATH).toString();

        // css("img", "src") selects the <img> below the anchor and reads its "src" attribute.
        String avatar = page.getHtml().xpath(AVATAR_LINK_XPATH).css("img","src").toString();

        // Both values are required; otherwise skip the page's result entirely.
        if (nickname == null || avatar == null) {
            page.setSkip(true);
            return;
        }

        page.putField("nickname", nickname);
        // NOTE: the key keeps its historical spelling ("avartar"); consumers
        // of the result fields look it up under exactly this name.
        page.putField("avartar", avatar);
    }

    /**
     * Provides the crawl settings used for this processor.
     *
     * @return a site configured with a 1000 ms sleep time, a 1000 ms timeout
     *         and 3 retries
     */
    @Override
    public Site getSite() {
        return Site.me()
                .setSleepTime(1000)
                .setTimeOut(1000)
                .setRetryTimes(3);
    }
}
